This is a simple presentation of my code for this project.
Thank you to Jose Macias for his help!
# About this script# Students, you will use this script to help you follow along the data workshop. # Each script is named for the city you will explore and use. This script is for San Diego, California.# Libraries to uselibrary(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Linking to GEOS 3.13.1, GDAL 3.10.2, PROJ 9.5.1; sf_use_s2() is TRUE
library(scales)
Attaching package: 'scales'
The following object is masked from 'package:purrr':
discard
The following object is masked from 'package:readr':
col_factor
# Read the csv in and generate new columnssd_crime <-read_csv("sd_crime.csv")
Rows: 76990 Columns: 25
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (16): nibrs_uniq, code_section, group_type, ibr_category, crime_against...
dbl (8): case_number, violent_crime, property_crime, beat, service_area, d...
dttm (1): occured_on
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `date_time = ymd_hms(occured_on)`.
Caused by warning:
! 7 failed to parse.
# Read in affordable housing csvsd_afh <-read_csv("sd_afh.csv") %>%mutate(GEOID =as.character(GEOID))
Rows: 399 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (8): PROPERTY_NAME, ADDRESS, GROUP_SERVICED, GEOID, Block, Tract, County...
dbl (4): TOTAL_UNITS, AFFORDABLE_UNITS, Long, Lat
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read in San Diego shapefile for tractssd_tracts <-st_read("sd_tracts.shp")
Reading layer `sd_tracts' from data source
`C:\Users\harol\Documents\sd_tracts.shp' using driver `ESRI Shapefile'
Simple feature collection with 1040 features and 6 fields
Geometry type: POLYGON
Dimension: XY
Bounding box: xmin: -117.2822 ymin: 32.53444 xmax: -116.423 ymax: 33.16414
Geodetic CRS: WGS 84
sd_tracts %>%mutate(GEOID =as.character(GEOID))
Simple feature collection with 1040 features and 6 fields
Geometry type: POLYGON
Dimension: XY
Bounding box: xmin: -117.2822 ymin: 32.53444 xmax: -116.423 ymax: 33.16414
Geodetic CRS: WGS 84
First 10 features:
GEOID Block Tract County State
1 060730073032 Block Group 2 Census Tract 73.03 San Diego County California
2 060730083551 Block Group 1 Census Tract 83.55 San Diego County California
3 060730214021 Block Group 1 Census Tract 214.02 San Diego County California
4 060730075013 Block Group 3 Census Tract 75.01 San Diego County California
5 060730054011 Block Group 1 Census Tract 54.01 San Diego County California
6 060730039013 Block Group 3 Census Tract 39.01 San Diego County California
7 060730083272 Block Group 2 Census Tract 83.27 San Diego County California
8 060730045012 Block Group 2 Census Tract 45.01 San Diego County California
9 060730083433 Block Group 3 Census Tract 83.43 San Diego County California
10 060730093053 Block Group 3 Census Tract 93.05 San Diego County California
hshld__ geometry
1 93030 POLYGON ((-117.2551 32.7414...
2 114821 POLYGON ((-117.1357 32.9242...
3 83409 POLYGON ((-117.2306 32.7372...
4 58808 POLYGON ((-117.2471 32.7514...
5 NA POLYGON ((-117.161 32.70938...
6 43043 POLYGON ((-117.1339 32.7032...
7 172109 POLYGON ((-117.2344 32.9642...
8 79844 POLYGON ((-117.1403 32.7194...
9 72014 POLYGON ((-117.2165 32.8664...
10 142173 POLYGON ((-117.1335 32.7858...
Phase 1: Data Wrangling & Analysis of Crime Trends
Task 1: Overall Crime Baseline
# Calculate the total number of crimes per yearsd_crime %>%group_by(year) %>%summarise(total_crime =n())
# A tibble: 6 × 2
year total_crime
<dbl> <int>
1 2020 18634
2 2021 18768
3 2022 17612
4 2023 16874
5 2024 5095
6 NA 7
sd_crime %>%group_by(year) %>%summarise(total_crime =n()) %>%ggplot(aes(x = year, y = total_crime)) +geom_line() +labs(title ="Figure 1: Crime in San Diego, 2020-2024", x ="Year", y ="Total Crime") +theme_minimal()
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_line()`).
# A tibble: 26 × 2
crime_type total_crime
<fct> <int>
1 All Other Offenses 16166
2 Larceny 9248
3 Vandalism 6966
4 Simple Assault 6613
5 Drug/Narcotic Violations 6344
6 Theft From Motor Vehicle 5955
7 Motor Vehicle Theft 5638
8 Aggravated Assault 3759
9 Alcohol Violations 3679
10 Fraud/Counterfeiting 3641
# ℹ 16 more rows
Bar Plot of Top 15 Crime Types
sd_crime %>%group_by(crime_type) %>%summarise(total_crime =n()) %>%arrange(-total_crime) %>%head(15) %>%ggplot(aes(x = total_crime, y = crime_type)) +geom_col() +labs(title ="Figure 2: Crime in San Diego by Top 15 Categories, 2020-2024", x ="Total Crimes", y ="Crime Type") +theme_minimal()
Area 2: Crime Seasonal Fluctuations
Bar Plot of Crime by Season
season_a_crime <- sd_crime %>%filter(!is.na(season)) %>%group_by(season) %>%summarise(total_crime =n(), .groups ="drop") %>%ggplot(aes(x = season, y = total_crime)) +geom_col() +labs(title ="Figure 3: Crime in San Diego by Season, 2020–2024",x ="Season", y ="Total Crimes") +theme_minimal()# Ensure it's printedprint(season_a_crime)
`summarise()` has grouped output by 'season'. You can override using the
`.groups` argument.
winter <-seasonal_crime(sd_crime, "Winter")
`summarise()` has grouped output by 'season'. You can override using the
`.groups` argument.
spring <-seasonal_crime(sd_crime, "Spring")
`summarise()` has grouped output by 'season'. You can override using the
`.groups` argument.
summer <-seasonal_crime(sd_crime, "Summer")
`summarise()` has grouped output by 'season'. You can override using the
`.groups` argument.
full_join(winter, fall) %>%full_join(spring) %>%full_join(summer) %>%ggplot(aes(x = crime_type, y = total_crime)) +geom_col() +labs(title ="Figure 4: Crime in San Diego by Season, 2020-2024", x ="Season", y ="Total Crimes") +theme_minimal() +coord_flip() +facet_wrap(~season, scales ="free")
Joining with `by = join_by(season, crime_type, total_crime)`
Joining with `by = join_by(season, crime_type, total_crime)`
Joining with `by = join_by(season, crime_type, total_crime)`
Area 4: Crime Distribution by Geography
Crime by County
sd_crime %>%filter(!is.na(County)) %>%group_by(County) %>%summarise(total_crime =n()) %>%ggplot(aes(x = County, y = total_crime)) +geom_col(fill ="steelblue") +labs(title ="Figure 5: Crime in San Diego by County, 2020–2024", x ="County", y ="Total Crimes") +theme_minimal()
Crime by Block
sd_crime %>%group_by(Block) %>%summarise(total_crime =n()) %>%ggplot(aes(x = Block, y = total_crime)) +geom_col(fill ="steelblue") +labs(title ="Figure 6: Crime in San Diego by Block, 2020–2024", x ="Block", y ="Total Crimes") +theme_minimal()
Crime by Neighborhood
sd_crime %>%filter(!is.na(neighborhood)) %>%group_by(neighborhood) %>%summarise(total_crime =n()) %>%ggplot(aes(x = neighborhood, y = total_crime)) +geom_col(fill ="steelblue") +labs(title ="Figure 7: Crime in San Diego by Neighborhood, 2020–2024", x ="Neighborhood", y ="Total Crimes") +theme_minimal()
Area 5: When Do Crimes Occur?
Crime by Weekday
sd_crime %>%filter(!is.na(weekday)) %>%group_by(weekday) %>%summarise(total_crime =n()) %>%ggplot(aes(x = weekday, y = total_crime)) +geom_col(fill ="steelblue") +labs(title ="Figure 8: Crime in San Diego by Weekday, 2020–2024", x ="Weekday", y ="Total Crimes") +theme_minimal()
Crime by Hour of the Day
sd_crime %>%mutate(hour_local =hour(with_tz(date_time, tzone ="America/Chicago"))) %>%group_by(hour_local) %>%summarise(total_crime =n(), .groups ="drop") %>%ggplot(aes(x = hour_local, y = total_crime)) +geom_col(fill ="steelblue") +labs(title ="Figure 9: Crime in San Diego by Hour, 2020–2024", x ="Hour of the Day", y ="Total Crimes") +theme_minimal()
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_col()`).
sd_afh %>%filter(!is.na(PROPERTY_NAME)) %>%group_by(Block) %>%summarise(TOTAL_UNITS =sum(TOTAL_UNITS, na.rm =TRUE)) %>%ggplot(aes(x = Block, y = TOTAL_UNITS)) +geom_col(fill ="steelblue") +labs(title ="Figure 10: Affordable Housing in San Diego by Block, 2020–2024", x ="Block Group", y ="Total Units") +theme_minimal()
Phase 3: Mapping Crime, Affordable Housing & Income
Map of Crime, Housing, and Income
# Task 3: From you analysis, lets visualize crime, affordable housing and income distribution to understand further what is going on# Area 1) Load leaflet and city tracts# library(leaflet)# library(leaflet.extras)# library(sf)
top8 <-as.character(sd_crime %>%group_by(crime_type) %>%summarize(total =n()) %>%arrange(desc(total)) %>%head(8) %>%pull(crime_type))# defining colors according to the top eight crimespal <-colorFactor(palette ='Dark2' ,domain = top8)
# defining a color pallet for the income variablespal_income <-colorBin("RdYlBu", domain = sd_tracts$hshld__, bins =10)# Create a new palette function using colorNumeric and a Brewer palettepal_income <-colorNumeric(palette ="YlGnBu", # Choose from RColorBrewer or use customdomain = sd_tracts$hshld__, na.color ="transparent")
Visualising Crime Locations
# Area 2) Pick a year to analyze and create a map where you plot crime locations using the latitude and longitude, select a subset of crimes to visualize otherwise, you might find your map hard communicate information. sd_crime %>%filter(crime_type %in% top8, year ==2021) %>%leaflet() %>%addProviderTiles("CartoDB") %>%addPolygons(data = sd_tracts,weight =2,opacity =0.2,dashArray ='3',color ="black",fillColor =~pal_income(hshld__),label =~paste0("Median Income: ", dollar(hshld__)),popup =~paste0("Census Tract: ", Tract, "<br>","Median Income: ", dollar(hshld__)),fillOpacity =0.7,highlightOptions =highlightOptions(weight =3,color ="#666",fillOpacity =0.9), ,group ="Income Distribution" ) %>%addCircleMarkers(lng =~lng, lat =~lat,radius =2,color =~pal(crime_type),label =~ crime_type) %>%addCircleMarkers(data = sd_afh,lng =~Long,lat =~Lat,radius =~sqrt(TOTAL_UNITS)/2,color ="#00CCFF",label =~PROPERTY_NAME) %>%addLegend(pal = pal, values = top8, title ="Crime Type",opacity =1, position ="bottomright" ) %>%addLegend(data = sd_tracts,pal = pal_income, values =~hshld__,opacity =0.7, title ="Income Distribution",position ="bottomright" )
# make a heatmapsd_crime %>%filter(crime_type %in% top8, year ==2021) %>%leaflet() %>%addProviderTiles("CartoDB") %>%addCircleMarkers(data = sd_afh,lng =~Long,lat =~Lat,radius =~sqrt(TOTAL_UNITS)/2,color ="black",label =~PROPERTY_NAME,group ="Affordable Housing") %>%addHeatmap(lng =~lng, lat =~lat,blur =20, max =0.05, radius =15,group ="Heat Map" ) %>%addLayersControl(overlayGroups =c("Affordable Housing","Heat Map"),options =layersControlOptions(collapsed =TRUE) )